/*

     Fully connected layer

     Forward pass:

     FC_layer keeps a copy of its input.
     The infer function computes the outputs and writes them straight to the next layer's input pointer.

     We keep our inputs to make backpropagation easier.


*/

/* Uniform pseudo-random float in [Min, Max], derived from the C library rand(). */
float RandomNumber(float Min, float Max)
{
    const float unit = float(rand()) / float(RAND_MAX); /* uniform in [0, 1] */
    return Min + unit * (Max - Min);
}

/* Derivative of tanh expressed in terms of the tanh OUTPUT value:
 * d/dx tanh(x) = 1 - tanh(x)^2, so callers pass tanh(x), not x itself. */
float dtanh(float x)
{
    const float sq = x * x;
    return 1.0f - sq;
}

/*
   PQ CORDIC tanh.

   Note that the CORDIC algorithm only works for inputs in the range -1.1182 to 1.1182.

  This is an inherent limitation of the CORDIC algorithm.

*/

/*
 * tanh(x) computed on the PowerQuad CORDIC engine.
 *
 * Seeds the CORDIC X/Y/Z registers in fixed-point form, starts an
 * iteration via the CONTROL register, then reads back the two result
 * registers and returns their ratio (tanh = sinh/cosh, so any common
 * CORDIC scale factor cancels in the division).
 *
 * NOTE(review): only valid for |x| <= ~1.1182 (CORDIC convergence range,
 * see the comment block above this function).
 */
float cordic_tanh(float x)
{
    volatile float tmp0;                      /* receives the Y-register result (numerator) */
    volatile float tmp1;                      /* receives the X-register result (denominator) */
    volatile uint32_t x_val = 0x13333333;     /* fixed-point X seed -- presumably a CORDIC gain-compensation constant; TODO confirm against the PowerQuad manual */

    POWERQUAD->CORDIC_Y = 0x0;
    POWERQUAD->CORDIC_X = x_val;
    POWERQUAD->CORDIC_Z = (int32_t) (x * 0x07ffffff);   /* convert the float argument to the engine's fixed-point format */
    POWERQUAD->CONTROL = (CP_CORDIC << 4) |  CORDIC_T(0) | CORDIC_MIU(1) | CORDIC_ITER(kPQ_Iteration_24);    /* kick off: CORDIC op, 24 iterations; MIU(1) presumably selects hyperbolic mode -- confirm in the manual */
    tmp0 = (float)   (int32_t) POWERQUAD->CORDIC_Y;
    tmp1 = (float)   (int32_t) POWERQUAD->CORDIC_X;

    return (tmp0/tmp1);
};
/*

PQ Coproc tanh

*/


/*
 * tanh(x) via the PowerQuad coprocessor exponential units:
 *
 *   tanh(x) = (e^x - e^-x) / (e^x + e^-x)
 *
 * e^x and e^-x are issued to two coprocessor slots (the 0/1 intrinsic
 * suffixes -- presumably independent pipes; confirm in the SDK docs),
 * and the division is performed as a reciprocal (_pq_inv0) followed by
 * a multiply.
 *
 * The (int32_t *)& casts shuttle the raw float bit patterns in and out
 * of the coprocessor: the _pq_* intrinsics take/return bits, not values.
 */
inline float cp_tanh(float x)
{
    float etox;       /* e^x  */
    float etonx;      /* e^-x */
    float x1, x2;

    _pq_etox0 (* (int32_t *) &x);
    _pq_etonx1 (* (int32_t *) &x);
    * (int32_t *) &etox  = _pq_readMult0();
    * (int32_t *) &etonx = _pq_readMult1();

    x1 = etox + etonx;               /* denominator */
    _pq_inv0(* (int32_t *) &x1);     /* start 1/denominator while we form the numerator */
    x2 = etox - etonx;               /* numerator */
    * (int32_t *) &x1  = _pq_readMult0();

    return (x1*x2) ;
};



/*
 * Fully-connected (dense) neural-network layer with tanh activation.
 *
 * Template parameters fix the geometry at compile time:
 *   dim_in  - number of inputs to this layer
 *   dim_out - number of outputs (neurons)
 *
 * The layer stores its own forward-pass input (val_in) so backprop() can
 * run without the caller re-supplying it.  Gradients are ACCUMULATED into
 * dW / dB across several backprop() calls (mini-batching) and are applied
 * and cleared by updateModel().
 */
template <uint32_t dim_in, uint32_t dim_out>
class FC_layer
{
    public :
        float weights [dim_in][dim_out];  /* weight matrix of this layer */
        float biases  [dim_out];          /* bias vector of this layer */
        float dW      [dim_in][dim_out];  /* accumulated weight-gradient, pending application */
        float dB      [dim_out];          /* accumulated bias-gradient, pending application */
        float dO      [dim_out];          /* output error (expected - predicted), filled by the next layer's backprop or by the training loop */
        float val_in  [dim_in];           /* copy of the forward-pass input, kept for backprop */

        /* Initialize weights and biases with uniform random values in [-1, 1].
         * Float literals avoid the double->float narrowing of the original. */
        void randomize()
        {
            for (uint32_t i = 0; i < dim_in; i++)       /* unsigned index matches dim_in's type */
            {
                for (uint32_t j = 0; j < dim_out; j++)
                {
                    weights[i][j] = RandomNumber(-1.0f, 1.0f);
                }
            }

            for (uint32_t j = 0; j < dim_out; j++)
            {
                biases[j] = RandomNumber(-1.0f, 1.0f);
            }
        }

        /* Zero the accumulated gradients.
         * Called after every updateModel(): an increment must only ever be
         * applied once. */
        void clearGradients()
        {
            for (uint32_t i = 0; i < dim_in; i++)
            {
                for (uint32_t j = 0; j < dim_out; j++)
                {
                    dW[i][j] = 0.0f;
                }
            }
            for (uint32_t j = 0; j < dim_out; j++)
            {
                dB[j] = 0.0f;
            }
        }

        /* Constructor: zeroed gradients, random weights and biases. */
        FC_layer()
        {
            clearGradients();
            randomize();
        }

        /* Forward pass.
         * The caller fills val_in[] beforehand.  Each output is
         * tanh(bias + weights^T * input), written to pRes -- typically the
         * next layer's val_in, which is how layers are chained. */
        void infer(float * pRes)
        {
            for (uint32_t j = 0; j < dim_out; j++)
            {
                float acc = biases[j];
                for (uint32_t i = 0; i < dim_in; i++)
                {
                    acc += weights[i][j] * val_in[i];
                }
                pRes[j] = cp_tanh(acc); /* activation function */
            }
        }

        /* Backward pass.
         *
         * pPreviousOut    - this layer's forward-pass OUTPUT (usually the next
         *                   layer's val_in, since each layer only stores its
         *                   input).  Needed because dtanh() takes the
         *                   activation VALUE, not the pre-activation sum.
         * pGradientToProp - optional out-array (dim_in entries): the error
         *                   signal to store into the previous layer's dO.
         *                   Pass NULL for the first layer.
         *
         * Expects dO[] to already hold this layer's output error.
         * Accumulates (does not apply) updates into dW / dB.
         */
        void backprop(float * pPreviousOut, float * pGradientToProp=NULL)
        {
            float tmp[dim_in];   /* error signal to push back to the previous layer */

            for (uint32_t i = 0; i < dim_in; i++)
            {
                tmp[i] = 0.0f;
            }

            for (uint32_t j = 0; j < dim_out; j++)
            {
                /* Error at the pre-activation point: output error scaled by
                 * the slope of tanh at this neuron's output. */
                const float grad_in = dO[j] * dtanh(pPreviousOut[j]);

                /* NULL test hoisted out of the inner loop (it is loop-invariant). */
                if (pGradientToProp != NULL)
                {
                    for (uint32_t i = 0; i < dim_in; i++)
                    {
                        dW[i][j] -= val_in[i] * grad_in;      /* weight-gradient accumulation */
                        tmp[i]   -= weights[i][j] * grad_in;  /* error routed back through the weights */
                    }
                }
                else
                {
                    for (uint32_t i = 0; i < dim_in; i++)
                    {
                        dW[i][j] -= val_in[i] * grad_in;
                    }
                }
                /* The bias feeds the sum directly (no input factor), so its
                 * gradient is grad_in itself. */
                dB[j] -= grad_in;
            }

            if (pGradientToProp != NULL)
            {
                for (uint32_t i = 0; i < dim_in; i++)
                {
                    pGradientToProp[i] = tmp[i];
                }
            }
        }

        /* Apply the accumulated gradients scaled by learning rate lr, then
         * clear them so the same increment is never applied twice. */
        void updateModel(float lr)
        {
            for (uint32_t j = 0; j < dim_out; j++)
            {
                biases[j] -= dB[j] * lr;
                for (uint32_t i = 0; i < dim_in; i++)
                {
                    weights[i][j] -= dW[i][j] * lr;
                }
            }
            clearGradients();
        }

};



/* Instantiate NN */

/* 30-16-8-8-30 autoencoder: the training loop below teaches the network
 * to reproduce its 30-value input through an 8-wide bottleneck. */
FC_layer<30,16> layer0;
FC_layer<16,8>  layer1;
FC_layer<8,8>   layer2;
FC_layer<8,30>  layer3;



extern "C" {

volatile float total_cost;  /* per-sample sum of squared reconstruction errors (volatile: observable from a debugger while training) */
volatile float calc_res;    /* result of the cp_tanh() smoke test below */

/*
 * Entry point: board and PowerQuad bring-up, then on-device training of
 * the 30-16-8-8-30 autoencoder for 5000 iterations, cycling through a
 * 200-sample window of input_data.
 */
int main(void)
{
    /* Init board hardware. */
    /* attach main clock divide to FLEXCOMM0 (debug console) */
    CLOCK_AttachClk(BOARD_DEBUG_UART_CLK_ATTACH);
    CLOCK_AttachClk(kMAIN_CLK_to_CLKOUT);
    CLOCK_SetClkDiv(kCLOCK_DivClkOut,125,false);

    BOARD_InitPins();
    BOARD_BootClockFROHF96M();
    BOARD_InitDebugConsole();

    PRINTF("hello autoencode.\r\n");

    /* Configure the PowerQuad for float-format operands.
     * NOTE(review): outputPrescale is 2 while every other prescale is 0 --
     * confirm this asymmetry is intentional. */
    PQ_Init(POWERQUAD);
    pq_config_t pq_cfg;
    pq_cfg.inputAFormat   = kPQ_Float;
    pq_cfg.inputAPrescale = 0;
    pq_cfg.inputBFormat   = kPQ_Float;
    pq_cfg.inputBPrescale = 0;
    pq_cfg.tmpFormat      = kPQ_Float;
    pq_cfg.tmpPrescale    = 0;
    pq_cfg.outputFormat   = kPQ_Float;
    pq_cfg.outputPrescale = 2;
    pq_cfg.tmpBase        = (uint32_t *)0xe0000000;  /* scratch base -- matches SDK PowerQuad examples; confirm for this part */
    pq_cfg.machineFormat  = kPQ_Float;
    PQ_SetConfig(POWERQUAD, &pq_cfg);

    /* NOTE(review): op_a, op_b and res are not declared before this point in
     * this file (res[30] only appears further down) -- this smoke-test call
     * presumably relies on definitions elsewhere; verify it still builds or
     * whether it is leftover test code. */
    PQ_VectorDotProduct(POWERQUAD, 5, op_a, op_b, &res);
    PQ_WaitDone(POWERQUAD);

    calc_res = cp_tanh(0.4654);  /* coprocessor sanity check */

    float res[30];  /* reconstruction produced by the final layer */

    uint32_t index=0;
    for (int epoch = 0; epoch < 5000; epoch++)
    {
        /* Walk the training set one sample per iteration, wrapping every 200. */
        index++;
        if (index == 200)
        {
            index = 0;
        }

        /* Forward pass: load one 30-value sample, push it through all layers. */
        for (int i = 0; i < 30; i++)
        {
            layer0.val_in[i] = input_data[(index*30)+i];
        }
        layer0.infer(layer1.val_in);
        layer1.infer(layer2.val_in);
        layer2.infer(layer3.val_in);
        layer3.infer(res);

        /* Learning: the autoencoder's target is its own input, so the output
         * error is (input - reconstruction). */
        total_cost= 0;
        for (int i = 0; i < 30; i++)
        {
            layer3.dO[i] = layer0.val_in[i] - res[i]; /* prediction error for this output */
            total_cost += (layer3.dO[i] * layer3.dO[i]);
        }

        /* Backward pass.
         * Each layer receives its own forward-pass OUTPUT (stored as the next
         * layer's val_in, since layers only keep their inputs) plus the array
         * in which to deposit the error signal for the layer before it (that
         * layer's dO).  layer0 has no predecessor, so nothing is propagated. */
        layer3.backprop(res           ,layer2.dO);
        layer2.backprop(layer3.val_in ,layer1.dO);
        layer1.backprop(layer2.val_in ,layer0.dO);
        layer0.backprop(layer1.val_in);

        /* Mini-batch update: apply the accumulated gradients every 10 samples. */
        if (index % 10 == 0)
        {
            layer3.updateModel(0.005);
            layer2.updateModel(0.005);
            layer1.updateModel(0.005);
            layer0.updateModel(0.005);
        }
    }

    /* Training done: idle in low-power wait-for-interrupt. */
    while(1)
    {
        __WFI();
    }
}

}; /* extern "C" */
